import requests

from spider_utils.csv_utils import img_data_to_csv
from spider_utils.img_utils import get_img_mes
from bs4 import BeautifulSoup

from spider_utils.tag_utils import get_tags

# Browser-like User-Agent so the site serves normal HTML to the scraper.
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/89.0.4389.114 Safari/537.36 '
}

# One shared session so TCP connections/cookies are reused across requests.
session = requests.session()

# Inclusive range of listing pages to crawl.
min_page = 1
max_page = 57
baseURL = 'https://www.fulitu.cc/'

# Source site, recorded verbatim in every scraped record.
src_site = 'https://www.fulitu.cc'

# Accumulates one dict per image: URL plus related metadata.
img_data_list = []

# Progress counter, printed once per listing page.
i = 1

for page in range(min_page, max_page + 1):

    # Progress: 1-based count of listing pages processed so far.
    print(i)
    i = i + 1

    # baseURL already ends with '/', so strip it before appending the
    # '/page/N' path — otherwise the request URL contains a double slash.
    url = baseURL.rstrip('/') + '/page/' + str(page)

    r = session.request('GET', url, headers=headers)
    soup = BeautifulSoup(r.text, "html.parser")

    # Collect the detail-page links from this listing page.
    page_url_list = [a_tag["href"] for a_tag in soup.select(".item-link")]

    # Visit every detail page and extract one record per image.
    for url in page_url_list:
        r = session.request('GET', url, headers=headers)
        soup = BeautifulSoup(r.text, "html.parser")
        for img_tag in soup.select(".post-item-img"):
            try:
                img_data = {
                    'src_site': src_site,
                    # Post tags are page-level; join their text into a CSV field.
                    'tags': ','.join(a.string for a in soup.select(".post-tags a")),
                    # 'data-original' holds the lazy-load image URL,
                    # protocol-relative — presumably always so; verify on site.
                    'src': "http:" + img_tag["data-original"],
                    'outer_net': 0,
                }
                # Fetch derived metadata (colors, size) for this image URL.
                other_mes = get_img_mes(img_data['src'])
                img_data['colors'] = other_mes['colors']
                img_data['size'] = other_mes['size']
                img_data_list.append(img_data)
            except Exception:
                # Narrowed from a bare `except:` so Ctrl-C / SystemExit still
                # abort the crawl; one bad image record must not stop the run.
                print("err!")
                continue

# Persist all scraped records. Guard the empty-crawl case: indexing
# img_data_list[0] on an empty list would raise IndexError here.
if img_data_list:
    img_data_to_csv(img_data_list, img_data_list[0].keys(), '../data/fulitu_data.csv')